{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder\n",
    "from sklearn.preprocessing import MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.90453403,  0.        ,  1.        ,  0.        ,  0.        ],\n",
       "       [-1.50755672,  1.41421356,  1.        ,  0.        ,  0.        ],\n",
       "       [-0.30151134,  0.        ,  0.        ,  1.        ,  0.        ],\n",
       "       [ 0.90453403, -1.41421356,  0.        ,  0.        ,  1.        ]])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "X = pd.DataFrame(\n",
    "    {'city': ['London', 'London', 'Paris', 'Sallisaw'],\n",
    "     'title': [\"His Last Bow\", \"How Watson Learned the Trick\",\n",
    "               \"A Moveable Feast\", \"The Grapes of Wrath\"],\n",
    "     'expert_rating': [5, 3, 4, 5],\n",
    "     'user_rating': [4, 5, 4, 3]})\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.compose import make_column_selector,ColumnTransformer\n",
    "ct = ColumnTransformer([\n",
    "      ('scale', StandardScaler(),make_column_selector(dtype_include=np.number)),\n",
    "      ('onehot',OneHotEncoder(), make_column_selector(pattern='city', dtype_include=object))])\n",
    "ct.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>A</th>\n",
       "      <th>B</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>x</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2.5</td>\n",
       "      <td>y</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3.0</td>\n",
       "      <td>x</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     A  B\n",
       "0  1.0  x\n",
       "1  2.5  y\n",
       "2  3.0  x"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {\n",
    "    'A': [1, 2.5, 3],\n",
    "    'B': ['x', 'y', 'x']\n",
    "}\n",
    "df = pd.DataFrame(data)\n",
    "df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[1., 0., 0., 1., 0., 0.],\n",
       "        [0., 1., 0., 0., 1., 0.],\n",
       "        [0., 0., 1., 0., 0., 1.]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print(type(LabelEncoder()))\n",
    "x = OneHotEncoder().fit_transform(df)\n",
    "type(x) # scipy.sparse.csr.csr_matrix\n",
    "x.todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0., 0.],\n",
       "       [1., 1.],\n",
       "       [2., 0.]])"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = OrdinalEncoder().fit_transform(df)\n",
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sk_trans = ColumnTransformer(transformers=[\n",
    "    ('min_max_A', MinMaxScaler(), ['A']),\n",
    "    ('label_encode_B', LabelEncoder(), ['B']),\n",
    "], remainder=\"passthrough\")\n",
    "\n",
    "sk_trans.fit_transform(df)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mlflow-dev-env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
